library("dplyr")
library("tidyr")
library("ggplot2")
library("GenomicRanges")
library("patchwork")

Diaz et al. DLBCL data

sample_names <- c("DLBCL", "mixA")
refs <- c("control", "mixB")
comparisons <- c(paste0(sample_names, "_vs_", refs))

chess_files <- list.files(file.path(basedir, "data", "chess", comparisons), full.names = TRUE, pattern = "25kb.txt")
names(chess_files) <- sapply(chess_files, function(path){
  parts <- tail(strsplit(path, "/")[[1]], 2)
  gsub(".txt|genome_scan", "", paste(parts, collapse = ""))
})

chess_comparisons_DLBCL <- lapply(chess_files, function(f){
  read.table(f, sep = "\t", header = TRUE) 
}) %>% bind_rows(.id = "comparison") %>%
  extract(comparison, regex = "(.*)_vs_(.*)_window([[:alnum:]]+Mb)_step([[:alnum:]]+[k|M]b)_([[:alnum:]]+kb)",
          into = c("query", "ref", "window_size", "step_size", "resolution"), 
          remove = FALSE) %>% 
  mutate(ref = factor(ref, levels = unique(refs)),
         query = factor(query, levels = unique(sample_names))) %>%
  dplyr::filter(!is.na(ssim))

# add genomic position data

pairs_files <- list.files(file.path(basedir, "data", "chess"), pattern = "hg38_pairs_.*.bedpe", full.names = TRUE)
names(pairs_files) <-  gsub(".bedpe|hg38_pairs_", "", basename(pairs_files))

comparison_regions <- lapply(pairs_files, function(f){
  read.table(f, sep = "\t", 
             col.names = c("chr", "start", "end", "chr2", "start2", "end2", 
                           "ID", "score", "strand1", "strand2")) %>% 
    dplyr::select(chr, start, end, ID)
}) %>% bind_rows(.id = "params")  %>%
  extract(params, regex = "window([[:alnum:]]+Mb)_step([[:alnum:]]+[k|M]b)", 
          into = c("window_size", "step_size"))

chess_comparisons_DLBCL <- left_join(chess_comparisons_DLBCL, comparison_regions)

Overview plots

chr_order <- paste0("chr", c(1:22, "X", "Y"))
chr_offsets <- 
  chess_comparisons_DLBCL  %>% 
  dplyr::select(chr, end) %>% 
  group_by(chr) %>% 
  summarise(length = max(end)) %>% 
  mutate(chr = factor(chr, levels = chr_order, ordered = TRUE)) %>% 
  arrange(chr) %>% 
  mutate(offset = lag(length, default = 0)) %>% 
  mutate(offset = cumsum(offset + 20e6)) %>% 
  mutate(midpoint = length/2) %>% 
  mutate(tick_pos = offset+midpoint)

to_plot <- chess_comparisons_DLBCL %>% 
  dplyr::filter(resolution=="25kb") %>% 
  mutate(chr = factor(chr, levels = chr_order, ordered = TRUE)) %>% 
  left_join(chr_offsets) %>% 
  mutate(pos = start + offset) %>% 
  mutate(category = case_when(
    z_ssim <= -1.2 & SN > 0.6 ~ "SN > 0.6 & Z-SSIM < -1.2",
    z_ssim <= -1.2  ~ "Z-SSIM < -1.2",
    TRUE ~ "Z-SSIM > -1.2"
  )) %>% 
  mutate(category = factor(category, levels = c("Z-SSIM > -1.2", "Z-SSIM < -1.2", "SN > 0.6 & Z-SSIM < -1.2"), ordered = TRUE)) %>% 
  arrange(category) %>% 
  tidyr::unite("comparison", query, ref, sep = "_vs_")

all_chrs <- to_plot %>% 
  ggplot(aes(x = pos, y = -1*z_ssim, colour = category)) +
  geom_hline(yintercept = 1.2, colour = "grey", linetype="dotted", size = 1) +
  geom_point(size = 0.8) +
  scale_colour_manual(values = c("Z-SSIM > -1.2" = "lightgrey", "Z-SSIM < -1.2" = "pink", "SN > 0.6 & Z-SSIM < -1.2" = "darkred")) +
  scale_x_continuous(breaks = chr_offsets$tick_pos, labels = chr_offsets$chr) +
  scale_y_continuous(breaks = c(-6, -4, -2, 0, 2)) +
  theme_classic(base_size = 16) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  labs(x = "", y = "-1 x Z-normalised\nSSIM") +
  facet_grid(comparison ~.)

chr2 <- to_plot %>% 
  dplyr::filter(chr=="chr2") %>% 
  ggplot(aes(x = start, y = -1*z_ssim, colour = category)) +
  geom_hline(yintercept = 1.2, colour = "grey", linetype="dotted", size = 1) +
  geom_point(size = 0.8) +
  scale_colour_manual(values = c("Z-SSIM > -1.2" = "lightgrey", "Z-SSIM < -1.2" = "pink", "SN > 0.6 & Z-SSIM < -1.2" = "darkred")) +
  scale_x_continuous(labels = scales::unit_format(unit = "Mb", scale = 1e-6)) +
  scale_y_continuous(breaks = c(-4, -2, 0, 2)) +
  theme_classic(base_size = 16) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "chr2", y = "-1 x Z-normalised\nSSIM") +
  facet_grid(comparison ~.)
 
all_chrs

chr2

Comparing real comparison vs mixed samples

chess_comparisons_DLBCL_real <- chess_comparisons_DLBCL %>% 
  dplyr::filter(query=="DLBCL", ref == "control") %>% 
  dplyr::select(-comparison, -query, -ref)

chess_comparisons_DLBCL_mixed <- chess_comparisons_DLBCL %>% 
  dplyr::filter(query=="mixA", ref == "mixB") %>% 
  dplyr::select(-comparison, -query, -ref)

join_cols <- setdiff(colnames(chess_comparisons_DLBCL_real), c("SN", "ssim", "z_ssim"))

chess_comparisons_DLBCL_combined <- chess_comparisons_DLBCL_real %>% 
  left_join(chess_comparisons_DLBCL_mixed,by = join_cols, suffix = c("_real", "_mixed"))
ssim_corAB <- broom::tidy(cor.test(chess_comparisons_DLBCL_combined$ssim_real,
                                 chess_comparisons_DLBCL_combined$ssim_mixed, method = "pearson"))

p1 <- ggplot(chess_comparisons_DLBCL_combined, aes(x = ssim_real, y = ssim_mixed)) +
  geom_point(size = 0.5) +
  geom_abline(slope = 1, intercept = 0, linetype = 2, colour = "blue", size = 1) +
  # geom_text(data = ssim_corAB, x = 0, y = 0.95, hjust = "left",
  #           aes(label = paste0("R = ", signif(estimate, 3)))) +
  # geom_text(data = ssim_corAB, x = 0, y = 0.95, hjust = "left",
  #           aes(label = paste0("\n\np = ", format.pval(p.value, 2)))) +
  theme_classic(base_size = 10) +
  labs(x = "SSIM (DLBCL, control)", y = "SSIM (mixA, mixB)")

SN_corAB <- broom::tidy(cor.test(chess_comparisons_DLBCL_combined$SN_real,
                                 chess_comparisons_DLBCL_combined$SN_mixed, method = "pearson"))
p2 <- ggplot(chess_comparisons_DLBCL_combined, aes(x = SN_real, y = SN_mixed)) +
  geom_point(size = 0.5) +
  geom_abline(slope = 1, intercept = 0, linetype = 2, colour = "blue", size = 1) +
  # geom_text(data = SN_corAB, x = 0.1, y = 0.6, hjust = "left",
  #           aes(label = paste0("R = ", signif(estimate, 3)))) +
  # geom_text(data = SN_corAB, x = 0.1, y = 0.6, hjust = "left",
  #           aes(label = paste0("\n\np = ", format.pval(p.value, 2)))) +
  theme_classic(base_size = 10) +
  labs(x = "SN (DLBCL, control)", y = "SN (mixA, mixB)")

Fig1_DLBCL_SSIM_SN_scatterplot <- p1 + p2

Fig1_DLBCL_SSIM_SN_scatterplot

ggsave(file.path(basedir, "figures/final/Fig1_DLBCL_SSIM_SN_scatterplot.pdf"))

bind_rows(SSIM = ssim_corAB, SN = SN_corAB, .id = "variable")
## # A tibble: 2 x 9
##   variable estimate statistic p.value parameter conf.low conf.high method       
##   <chr>       <dbl>     <dbl>   <dbl>     <int>    <dbl>     <dbl> <chr>        
## 1 SSIM        0.979     356.        0      5430    0.978     0.980 Pearson's pr…
## 2 SN          0.767      88.0       0      5430    0.756     0.777 Pearson's pr…
## # … with 1 more variable: alternative <chr>
ssim_wilcoxAB <- broom::tidy(wilcox.test(chess_comparisons_DLBCL_combined$ssim_real,
                                         chess_comparisons_DLBCL_combined$ssim_mixed, paired = TRUE))
SN_wilcoxAB <- broom::tidy(wilcox.test(chess_comparisons_DLBCL_combined$SN_real,
                                 chess_comparisons_DLBCL_combined$SN_mixed, paired = TRUE))

bind_rows(SSIM = ssim_wilcoxAB, SN = SN_wilcoxAB, .id = "variable")
## # A tibble: 2 x 5
##   variable statistic p.value method                                  alternative
##   <chr>        <dbl>   <dbl> <chr>                                   <chr>      
## 1 SSIM         23305       0 Wilcoxon signed rank test with continu… two.sided  
## 2 SN        14755582       0 Wilcoxon signed rank test with continu… two.sided
ssim_diff <- chess_comparisons_DLBCL_combined %>% 
  mutate(ssim_diff = ssim_real - ssim_mixed)

Fig1_DLBCL_SSIM_diff_chr2 <- ssim_diff %>% 
  dplyr::filter(chr == "chr2") %>% 
  ggplot(aes(x = start, y = ssim_diff)) +
  geom_line() +
  scale_x_continuous(labels = scales::unit_format(unit = "Mb", scale = 1e-6)) +
  theme_classic(base_size = 10) +
  labs(x = "Position on chr2", y = "SSIM (patient, control)\n- SSIM (mixed)")

# ssim_diff %>%
#   arrange(ssim_diff) %>%
#   dplyr::filter(chr == "chr2") %>%
#   head(n = 10) %>%
#   select(chr, start, end) %>%
#   unique() %>%
#   write.table(file = file.path(basedir, "data/chess/DLBCL_vs_control/chr2_top10_differences_25kb.bed"),
#               col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")

Fig1_DLBCL_SSIM_diff_chr2

ggsave(file.path(basedir, "figures/final/Fig1_DLBCL_SSIM_diff_chr2.pdf"))


chess_comparisons_DLBCL_combined_ranking <- chess_comparisons_DLBCL_combined %>% 
  mutate(rank_real = dplyr::min_rank(ssim_real),
         rank_mixed = dplyr::min_rank(ssim_mixed)) %>% 
  mutate(rank_diff_mixed = rank_real - rank_mixed)

Fig1_DLBCL_SSIM_rankdiff_chr2 <- chess_comparisons_DLBCL_combined_ranking %>% 
  dplyr::filter(chr == "chr2") %>% 
  ggplot(aes(x = start, y = rank_diff_mixed)) +
  geom_line() +
  scale_x_continuous(labels = scales::unit_format(unit = "Mb", scale = 1e-6)) +
  theme_classic(base_size = 10) +
  labs(x = "Position on chr2", y = "SSIM rank difference")

Fig1_DLBCL_SSIM_rankdiff_chr2

ggsave(file.path(basedir, "figures/final/Fig1_DLBCL_SSIM_rankdiff_chr2.pdf"))
top10_ssim_diff <- ssim_diff %>%
  arrange(ssim_diff) %>%
  dplyr::filter(chr == "chr2") %>%
  head(n = 10)

top10_ssim_rankdiff <- chess_comparisons_DLBCL_combined_ranking %>%
  arrange(rank_diff_mixed) %>%
  dplyr::filter(chr == "chr2") %>%
  head(n = 10)

top10_ssim_diff %>% 
  dplyr::filter(ID %in% top10_ssim_rankdiff$ID) %>% 
  arrange(start) %>% 
  tidyr::unite("region", chr, start, end, sep = "-") %>% 
  pull(region)
## [1] "chr2-31500000-33500000"   "chr2-86000000-88000000"  
## [3] "chr2-86500000-88500000"   "chr2-96000000-98000000"  
## [5] "chr2-104000000-106000000" "chr2-145000000-147000000"
## [7] "chr2-146000000-148000000" "chr2-182500000-184500000"
Fig1_DLBCL_SSIM_SN_scatterplot + Fig1_DLBCL_SSIM_diff_chr2 + 
  plot_layout(widths = c(1.5, 1.5, 5))

ggsave(file.path(basedir, "figures/final/Fig1_DLBCL_SSIM_diff_assembled.pdf"))


Fig1_DLBCL_SSIM_SN_scatterplot + Fig1_DLBCL_SSIM_rankdiff_chr2 + 
  plot_layout(widths = c(1.5, 1.5, 5))

ggsave(file.path(basedir, "figures/final/Fig1_DLBCL_SSIM_rankdiff_assembled.pdf"))

Of the ten 2 Mb windows with the largest SSIM differences on chr2, 6 are genuine differences. 3 are adjacent to the centromere and contain regions that have mapping artefacts and low coverage and should likely have been filtered in Hi-C processing. One region contains a genuine patient-control difference that nevertheless overlaps a likely mapping artefact.

Comparing SSIM and SN to coverage and insulation variance

insulation_score_bigwigs <- list.files(file.path(basedir, "data", "boundaries"), 
                                       full.names = TRUE, pattern = "(DLBCL_25kb|control_25kb).*bw")

names(insulation_score_bigwigs) <- gsub(".bw", "", basename(insulation_score_bigwigs))
insulation_score_list <- lapply(insulation_score_bigwigs, rtracklayer::import.bw)

regions <- makeGRangesFromDataFrame(comparison_regions, keep.extra.columns = TRUE)
start(regions) <- start(regions) + 1

comparison_regions <- as_tibble(comparison_regions)
comparison_regions$regions <- as.list(split(regions, 1:length(regions)))

get_var <- function(regions, insulation, ...){
  gr <- subsetByOverlaps(insulation, regions)
  var(gr$score, na.rm = TRUE)
}

# just using two representative insulation tracks
regions_w_var_list <- lapply(insulation_score_list[c("control_25kb_8", "DLBCL_25kb_8")], function(x){
  comparison_regions %>% 
    mutate(variance = purrr::pmap_dbl(comparison_regions, get_var, insulation = x)) %>% 
    return(.)
  })
regions_w_var <- bind_rows(regions_w_var_list, .id = "insulation")  %>% 
  dplyr::select(-regions) %>% 
  tidyr::pivot_wider(names_from = insulation, values_from = variance)

# regions_w_var %>%
#   dplyr::select(starts_with("control"), starts_with("DLBCL")) %>%
#   as.matrix() %>%
#   cor(use = "pairwise.complete.obs")

chess_comparisons_DLBCL_combined_w_var <- left_join(chess_comparisons_DLBCL_combined, 
                                                   regions_w_var)
samples <- c("DLBCL", "control")

marginals_files <- file.path(basedir, "data/hic/", samples, paste0(samples, "_25kb_marginals.bed"))
names(marginals_files) <- gsub("_marginals.bed", "", basename(marginals_files))
marginals_list <- lapply(marginals_files, function(f){
  readr::read_delim(f, delim = "\t", col_names = c("chr", "start", "end", "name", "score", "strand")) %>% 
    makeGRangesFromDataFrame(keep.extra.columns = TRUE)
})

get_coverage <- function(regions, marginals, ...){
  gr <- subsetByOverlaps(marginals, regions)
  sum(gr$score, na.rm = TRUE)
}

regions_w_coverage_list <- lapply(marginals_list, function(x){
  comparison_regions %>% 
    mutate(coverage = purrr::pmap_dbl(comparison_regions, get_coverage, marginals = x)) %>% 
    return(.)
  })
regions_w_coverage <- bind_rows(regions_w_coverage_list, .id = "sample")  %>% 
  dplyr::select(-regions) %>% 
  # mutate(coverage = log(coverage)) %>% 
  tidyr::pivot_wider(names_from = sample, values_from = coverage)

# regions_w_var %>%
#   dplyr::select(starts_with("control"), starts_with("DLBCL")) %>%
#   as.matrix() %>%
#   cor(use = "pairwise.complete.obs")

chess_comparisons_DLBCL_combined_w_var_coverage <- left_join(chess_comparisons_DLBCL_combined_w_var, 
                                                   regions_w_coverage)
chess_comparisons_DLBCL_combined_w_var_coverage <- chess_comparisons_DLBCL_combined_w_var_coverage %>% 
  dplyr::rename("insulation_variance_control" = "control_25kb_8",
                "insulation_variance_DLBCL" = "DLBCL_25kb_8",
                "coverage_control" = "control_25kb",
                "coverage_DLBCL" = "DLBCL_25kb")

head(chess_comparisons_DLBCL_combined_w_var_coverage)
##   window_size step_size resolution ID   SN_real ssim_real z_ssim_real  chr
## 1         2Mb     500kb       25kb  1 0.4104954 0.2826877 -0.70530704 chr1
## 2         2Mb     500kb       25kb  2 0.4947246 0.5421304  0.85147762 chr1
## 3         2Mb     500kb       25kb  3 0.5304248 0.4904586  0.54142104 chr1
## 4         2Mb     500kb       25kb  4 0.4247878 0.4689058  0.41209361 chr1
## 5         2Mb     500kb       25kb  5 0.4559340 0.4126787  0.07470307 chr1
## 6         2Mb     500kb       25kb  6 0.4673613 0.4436457  0.26052031 chr1
##     start     end  SN_mixed ssim_mixed z_ssim_mixed insulation_variance_control
## 1  500000 2500000 0.2281341  0.4363934  -0.24419117                   0.3982246
## 2 1000000 3000000 0.2583484  0.6382701   0.93089992                   0.4445615
## 3 1500000 3500000 0.2514610  0.6144611   0.79231209                   0.4570081
## 4 2000000 4000000 0.2485482  0.6140805   0.79009642                   0.2825730
## 5 2500000 4500000 0.2788098  0.4838057   0.03178834                   0.1896939
## 6 3000000 5000000 0.2822353  0.4836198   0.03070607                   0.1133514
##   insulation_variance_DLBCL coverage_DLBCL coverage_control
## 1                 0.2660260         339713           215010
## 2                 0.3051939         369928           237218
## 3                 0.3539829         349074           229303
## 4                 0.2570517         349493           233177
## 5                 0.2070390         339284           232132
## 6                 0.1085575         349060           241125
correlations <- chess_comparisons_DLBCL_combined_w_var_coverage %>% 
  dplyr::select(starts_with("ssim"), starts_with("SN"), 
                starts_with("insulation_variance"), starts_with("coverage")) %>%
  corrr::correlate(method = "pearson") %>% 
  dplyr::select(term, starts_with("insulation_variance"), starts_with("coverage")) %>% 
  dplyr::filter(startsWith(term, "ssim") | startsWith(term, "SN")) %>% 
  corrr::stretch()

correlations %>% 
  tidyr::extract(x, into = c("variable", "var_sample"), regex = "(.*)_(control|DLBCL)") %>% 
  tidyr::extract(y, into = c("output", "comparison"), regex = "(.*)_(real|mixed)") %>% 
  dplyr::filter(comparison=="real") %>% 
  ggplot(aes(x = output, y = var_sample, fill = r, label = signif(r, 2))) +
  geom_tile() +
  geom_text() +
  facet_wrap(~variable,nrow = 2, strip.position = "left") +
  theme_classic(base_size = 10) +
  scale_fill_distiller(direction = 1) +
  theme(strip.placement = "outside") +
  labs(y = "", x = "", fill = "Pearson's R")

ggsave(file.path(basedir, "figures/final/Fig1_DLBCL_correlations.pdf"))

Wutz et al. cohesin and CTCF degron data

Valid pairs analysis

N.B. as these numbers are extracted from .mcools, these are post-filtering of binned data.

wutz_metadata <- readr::read_tsv(file.path(basedir, "metadata_2021-09-27-15h-22m.tsv"), comment = "#")

sample_names <- paste(wutz_metadata$`Condition Name`, wutz_metadata$Rep, sep = "_")

stats_files <- file.path(basedir, "data/hic/", sample_names, paste0(sample_names, "_50kb_hic_stats.txt"))

stats <- lapply(stats_files, function(f){
  read.csv(f, header = FALSE, 
         col.names = c("class", "n"))
}) %>% setNames(sample_names) %>% 
  bind_rows(.id = "file")

ggplot(stats, aes(x = file, y = n)) +
  geom_col() + 
  coord_flip() +
  scale_y_continuous("Sequencing depth", labels = scales::unit_format(unit = "million", scale = 1e-6)) +
  theme_classic(base_size = 10)

stats %>% 
  arrange(n)
##                               file class         n
## 1           CTCFdegron_G1_auxin0_1 total 147993483
## 2         SCC1degron_G1_auxin180_1 total 151452441
## 3                     wt_G1_none_1 total 171927161
## 4         CTCFdegron_G1_auxin120_1 total 183429031
## 5         SCC1degron_G1_WAPLRNAi_1 total 277647794
## 6         SCC1degron_G1_PDS5RNAi_1 total 294562323
## 7           SCC1degron_G1_noRNAi_1 total 307920917
## 8           SCC1degron_G1_auxin0_1 total 359170180
## 9 SCC1degron_prometaphase_noRNAi_1 total 369224447

Because the sequencing depths vary quite a lot, I will continue with datasets downsampled to the same number of valid read pairs.

Resolution calculations

marginals_files <- file.path(basedir, "data/hic/", sample_names, paste0(sample_names, "_50kb_marginals.bed"))
names(marginals_files) <- gsub("_marginals.bed", "", basename(marginals_files))
wutz_marginals_list <- lapply(marginals_files, function(f){
  readr::read_delim(f, delim = "\t", col_names = c("chr", "start", "end", "name", "score", "strand")) %>%
    makeGRangesFromDataFrame(keep.extra.columns = TRUE)
})
wutz_marginals_df <- lapply(wutz_marginals_list, as.data.frame) %>% 
  bind_rows(.id = "file")

wutz_percent_df <- wutz_marginals_df %>%
  group_by(file) %>%
  summarise(percent = 100 * sum(score > 1000) / n()) 

# percent_df %>%
#   knitr::kable(digits = 1)

ggplot(wutz_percent_df, aes(x = file, y = percent, label = signif(percent, 3))) +
  geom_col() +
  geom_text(colour = "black", hjust = "outward", nudge_y = 1) +
  geom_hline(yintercept = 80, linetype = 2) +
  scale_y_continuous("% bins with more than 1000 valid reads", limits = c(0, 100)) +
  coord_flip() +
  theme_classic(base_size = 10) +
  labs(x = "")

At 50 kb resolution, all samples pass the Rao et al. threshold of having 80% of bins with at least 1000 reads.

CHESS results

combinations <- expand.grid(sample_names, sample_names) %>% 
  filter(Var1 != Var2)

comparisons <- c(paste0(combinations$Var1, "_vs_", combinations$Var2))

chess_files <- list.files(file.path(basedir, "data", "chess", comparisons, "downsampled"), full.names = TRUE, pattern = "chr.*txt")
names(chess_files) <- sapply(chess_files, function(path){
  parts <- tail(strsplit(path, "/")[[1]], 3)
  gsub("downsampled_|.txt", "", paste(parts, collapse = "_"))
})

chess_comparisons_wutz <- lapply(chess_files, function(f){
  read.table(f, sep = "\t", header = TRUE) 
}) %>% bind_rows(.id = "comparison") 

chess_comparisons_wutz <- chess_comparisons_wutz %>%
  extract(comparison, regex = "(.*)_vs_(.*)_(chr[[:alnum:]]+)_window([[:alnum:]]+Mb)_step([[:alnum:]]+[k|M]b)_([[:alnum:]]+kb)",
          into = c("query", "ref", "chr", "window_size", "step_size", "resolution"), 
          remove = FALSE) %>% 
  dplyr::filter(!is.na(ssim))

chess_comparisons_wutz <- left_join(chess_comparisons_wutz, dplyr::select(comparison_regions, -regions))

plot SSIM and SN violin plots for WT vs. controls, degrons, prometaphase

plotting_subset_wutz <- chess_comparisons_wutz %>% 
  dplyr::filter(query=="wt_G1_none_1") %>% 
  dplyr::filter(ref %in% c("CTCFdegron_G1_auxin0_1", "CTCFdegron_G1_auxin120_1", 
                           "SCC1degron_G1_auxin0_1", "SCC1degron_G1_auxin180_1", 
                           "SCC1degron_prometaphase_noRNAi_1")) %>% 
  dplyr::mutate(ref = case_when(
    ref == "CTCFdegron_G1_auxin0_1" ~ "CTCF degron (no auxin)",
    ref == "CTCFdegron_G1_auxin120_1" ~ "CTCF degron (auxin 2h)",
    ref == "SCC1degron_G1_auxin0_1" ~ "SCC1 degron (no auxin)",
    ref == "SCC1degron_G1_auxin180_1" ~ "SCC1 degron (auxin 3h)",
    ref == "SCC1degron_prometaphase_noRNAi_1" ~ "SCC1 degron (no auxin, prometaphase)"
  )) %>% 
  dplyr::mutate(ref = factor(ref, levels = c("CTCF degron (no auxin)", 
                                             "CTCF degron (auxin 2h)",
                                             "SCC1 degron (no auxin)",
                                             "SCC1 degron (auxin 3h)",
                                             "SCC1 degron (no auxin, prometaphase)"), 
                             ordered = TRUE))
  
Fig1_Wutz_SSIM_violin <- ggplot(plotting_subset_wutz, aes(x = ref, y = ssim)) +
  geom_violin() +
  stat_summary(fun.data = "mean_sdl",  fun.args = list(mult = 1), 
    geom = "pointrange", color = "black") +
  theme_classic(base_size = 10) + 
  coord_flip() +
  labs(x = "", y = "SSIM (WT, query)") +
  scale_x_discrete(limits = rev)
Fig1_Wutz_SSIM_violin

ggsave(file.path(basedir, "figures/final/Fig1_Wutz_SSIM_violin.pdf"))


Fig1_Wutz_SN_violin <- ggplot(plotting_subset_wutz, aes(x = ref, y = SN)) +
  geom_violin() +
  stat_summary(fun.data = "mean_sdl",  fun.args = list(mult = 1), 
    geom = "pointrange", color = "black") +
  theme_classic(base_size = 10) + 
  coord_flip() +
  labs(x = "", y = "SN (WT, query)") +
  scale_x_discrete(limits = rev)
Fig1_Wutz_SN_violin

ggsave(file.path(basedir, "figures/final/Fig1_Wutz_SN_violin.pdf"))

plot example chromosome

Fig1_Wutz_SSIM_chr2 <- plotting_subset_wutz %>%
  filter(chr=="chr2") %>% 
  mutate(group = paste0(ref, ifelse(start < 90e6, 1, 2))) %>% 
  ggplot(aes(x = start, y = ssim, colour = ref, group = group)) +
  geom_line() +
  theme_classic(base_size = 10) +
  scale_x_continuous(labels = scales::unit_format(unit = "Mb", scale = 1e-6)) +
  labs(y = "SSIM (WT, query)", x = "", colour = "") +
  scale_colour_manual(values = c("darkblue", "cornflowerblue", "darkred", "lightcoral", "grey")) +
  theme(legend.position="bottom") +
  guides(colour=guide_legend(nrow=2,byrow=TRUE))
Fig1_Wutz_SSIM_chr2

ggsave(file.path(basedir, "figures/final/Fig1_Wutz_SSIM_chr2.pdf"))


Fig1_Wutz_SSIM_chr16 <- plotting_subset_wutz %>%
  filter(chr=="chr16") %>% 
  mutate(group = paste0(ref, ifelse(start < 40e6, 1, 2))) %>% 
  ggplot(aes(x = start, y = ssim, colour = ref, group = group)) +
  geom_line() +
  theme_classic(base_size = 10) +
  scale_x_continuous(labels = scales::unit_format(unit = "Mb", scale = 1e-6)) +
  labs(y = "SSIM (WT, query)", x = "", colour = "") +
  scale_colour_manual(values = c("darkblue", "cornflowerblue", "darkred", "lightcoral", "grey")) +
  theme(legend.position="bottom") +
  guides(colour=guide_legend(nrow=2,byrow=TRUE))
Fig1_Wutz_SSIM_chr16

ggsave(file.path(basedir, "figures/final/Fig1_Wutz_SSIM_chr16.pdf"))

Coverage correlations

get_coverage <- function(regions, marginals, ...){
  gr <- subsetByOverlaps(marginals, regions)
  sum(gr$score, na.rm = TRUE)
}

comparison_regions_4Mb <- comparison_regions %>% 
  dplyr::filter(window_size == "4Mb") %>% 
  as_tibble()

regions_4Mb <- makeGRangesFromDataFrame(comparison_regions_4Mb, keep.extra.columns = TRUE)
start(regions_4Mb) <- start(regions_4Mb) + 1

comparison_regions_4Mb$regions <- as.list(split(regions_4Mb, 1:length(regions_4Mb)))

regions_w_wutz_coverage_list <- lapply(wutz_marginals_list, function(x){
  comparison_regions_4Mb %>% 
    mutate(coverage = purrr::pmap_dbl(comparison_regions_4Mb, get_coverage, marginals = x)) %>% 
    return(.)
  })

regions_w_wutz_coverage <- bind_rows(regions_w_wutz_coverage_list, .id = "sample")  %>% 
  dplyr::select(-regions) %>% 
  # mutate(coverage = log(coverage)) %>% 
  tidyr::pivot_wider(names_from = sample, values_from = coverage, names_prefix = "cov_")
regions_w_wutz_coverage <- regions_w_wutz_coverage %>% 
  dplyr::rename_with(.fn = function(x) { gsub("_50kb", "", x) })

wutz_correlations <- chess_comparisons_wutz %>% 
  dplyr::select(-comparison) %>% 
  dplyr::filter(query=="wt_G1_none_1") %>% 
  dplyr::filter(ref %in% c("CTCFdegron_G1_auxin0_1", "CTCFdegron_G1_auxin120_1", 
                           "SCC1degron_G1_auxin0_1", "SCC1degron_G1_auxin180_1", 
                           "SCC1degron_prometaphase_noRNAi_1")) %>%
  dplyr::select(-z_ssim, -SN) %>% 
  pivot_wider(names_from = ref, values_from = ssim, names_prefix = "ssim_") %>% 
  left_join(regions_w_wutz_coverage) %>% 
  dplyr::select(-c(query, chr, start, end, window_size, step_size, resolution, ID)) %>% 
  corrr::correlate() %>% 
  corrr::stretch()

wutz_correlations_subset <- wutz_correlations %>% 
  dplyr::filter(startsWith(x, "ssim") & startsWith(y, "cov")) %>% 
  dplyr::mutate(ssim = gsub("ssim_", "", x),
                cov = gsub("cov_", "", y)) %>% 
  dplyr::select(-x, -y) %>% 
  dplyr::filter(cov == "wt_G1_none_1" | ssim==cov) %>% 
  dplyr::mutate(cov = ifelse(cov == "wt_G1_none_1", "WT", "self")) %>% 
  dplyr::mutate(ssim = case_when(
    ssim == "CTCFdegron_G1_auxin0_1" ~ "CTCF degron (no auxin)",
    ssim == "CTCFdegron_G1_auxin120_1" ~ "CTCF degron (auxin 2h)",
    ssim == "SCC1degron_G1_auxin0_1" ~ "SCC1 degron (no auxin)",
    ssim == "SCC1degron_G1_auxin180_1" ~ "SCC1 degron (auxin 3h)",
    ssim == "SCC1degron_prometaphase_noRNAi_1" ~ "SCC1 degron (no auxin, prometaphase)"
  )) %>% 
  dplyr::mutate(ssim = factor(ssim, levels = c("CTCF degron (no auxin)", 
                                             "CTCF degron (auxin 2h)",
                                             "SCC1 degron (no auxin)",
                                             "SCC1 degron (auxin 3h)",
                                             "SCC1 degron (no auxin, prometaphase)"), 
                             ordered = TRUE))

Fig1_Wutz_coverage_correlations <- ggplot(wutz_correlations_subset, aes(x = ssim, y = r)) +
  geom_col() +
  facet_grid(~cov) +
  coord_flip() +
  theme_classic(base_size = 10) +
  labs(x = "", y = "Pearson's R") +
  scale_x_discrete(limits = rev)
Fig1_Wutz_coverage_correlations

ggsave(file.path(basedir, "figures/final/Fig1_Wutz_coverage_correlations.pdf"))
Fig1_Wutz_SSIM_violin + 
  Fig1_Wutz_SN_violin + scale_x_discrete(labels = rep("", 5)) +
  Fig1_Wutz_coverage_correlations + scale_x_discrete(labels = rep("", 5), limits = rev) +
  plot_layout(widths = c(1, 1, 2))

ggsave(file.path(basedir, "figures/final/Fig1_Wutz_SSIM_SN_violin_correlations_assembled.pdf"))

Rao et al. GM12878 and K562 analysis

Valid pair counts of GM12878 and K562 biological replicates

stats_files <- list.files(file.path(basedir, "data/hic/"), pattern = "*hic_stats.txt", recursive = TRUE)
rao_stats_files <- stats_files[startsWith(stats_files, "GM12878") | startsWith(stats_files, "K562")]

rao_stats <- lapply(rao_stats_files, function(f){
  read.csv(file.path(basedir, "data", "hic", f), header = FALSE, 
         col.names = c("file", "class", "n"))
}) %>% bind_rows() %>% 
  dplyr::filter(class == "valid")

rao_stats <- rao_stats %>% 
  mutate(file = gsub(".pairs", "", file))

ggplot(rao_stats, aes(x = file, y = n)) +
  geom_col() + 
  coord_flip() +
  scale_y_continuous("Number of valid pairs", labels = scales::unit_format(unit = "million", scale = 1e-6)) +
  theme_classic(base_size = 10)

Resolution calculations for GM12878 and K562 replicates

sample_names <- c(paste("GM12878", 1:9, sep = "_"),
                  paste("K562", 1:2, sep = "_"))

rao_marginals_files <- file.path(basedir, "data/hic/", sample_names, paste0(sample_names, "_25kb_marginals.bed"))
names(rao_marginals_files) <- gsub("_marginals.bed", "", basename(rao_marginals_files))
rao_marginals_list <- lapply(rao_marginals_files, function(f){
  readr::read_delim(f, delim = "\t", col_names = c("chr", "start", "end", "name", "score", "strand")) %>% 
    makeGRangesFromDataFrame(keep.extra.columns = TRUE)
})
rao_marginals_df <- lapply(rao_marginals_list, as.data.frame) %>% 
  bind_rows(.id = "file")

rao_percent_df <- rao_marginals_df %>%
  group_by(file) %>%
  summarise(percent = 100 * sum(score > 1000) / n()) 

# percent_df %>%
#   knitr::kable(digits = 1)

ggplot(rao_percent_df, aes(x = file, y = percent, label = signif(percent, 3))) +
  geom_col() +
  geom_text(colour = "black", hjust = "outward", nudge_y = 1) +
  geom_hline(yintercept = 80, linetype = 2) +
  scale_y_continuous("% bins with more than 1000 valid reads", limits = c(0, 100)) +
  coord_flip() +
  theme_classic(base_size = 10) +
  labs(x = "")

CHESS results

combinations <- expand.grid(sample_names, sample_names) %>% 
  filter(Var1 != Var2)

comparisons <- c(paste0(combinations$Var1, "_vs_", combinations$Var2))

chess_files <- list.files(file.path(basedir, "data", "chess", comparisons), full.names = TRUE, pattern = "chr.*txt")
names(chess_files) <- sapply(chess_files, function(path){
  parts <- tail(strsplit(path, "/")[[1]], 2)
  gsub(".txt", "", paste(parts, collapse = "_"))
})

chess_comparisons_rao <- lapply(chess_files, function(f){
  read.table(f, sep = "\t", header = TRUE) 
}) %>% bind_rows(.id = "comparison") %>%
    extract(comparison, regex = "(.*)_vs_(.*)_(chr[[:alnum:]]+)_window([[:alnum:]]+Mb)_step([[:alnum:]]+[k|M]b)_([[:alnum:]]+kb)",
          into = c("query", "ref", "chr", "window_size", "step_size", "resolution"), 
          remove = FALSE) %>% 
  dplyr::filter(!is.na(ssim))

chess_comparisons_rao <- left_join(chess_comparisons_rao, dplyr::select(comparison_regions, -regions))
chess_comparisons_rao_plotting_subset <- chess_comparisons_rao %>% 
  dplyr::filter(ref == "GM12878_1") %>% 
  left_join(rao_stats, by = c("query" = "file")) %>% 
  rename("n" = "valid pairs")

ggplot(chess_comparisons_rao_plotting_subset, aes(x = query, y = ssim, fill = `valid pairs`)) +
  geom_violin() +
  stat_summary(fun.data = "mean_sdl",  fun.args = list(mult = 1), 
    geom = "pointrange", color = "black") +
  theme_classic(base_size = 10) + 
  coord_flip() +
  scale_x_discrete("", limits = rev) +
  scale_fill_fermenter("Valid pairs\n(millions)", 
                       labels = scales::unit_format(unit = "", scale = 1e-6),
                       direction = 1)

ggsave(file.path(basedir, "figures/final/Fig2_Rao_SSIM_violin.pdf"))

ggplot(chess_comparisons_rao_plotting_subset, aes(x = query, y = SN, fill = `valid pairs`)) +
  geom_violin() +
  stat_summary(fun.data = "mean_sdl",  fun.args = list(mult = 1), 
    geom = "pointrange", color = "black") +
  theme_classic(base_size = 10) + 
  coord_flip() +
  scale_x_discrete("", limits = rev) +
  scale_fill_fermenter("Valid pairs\n(millions)", 
                       labels = scales::unit_format(unit = "", scale = 1e-6),
                       direction = 1)

ggsave(file.path(basedir, "figures/final/Fig2_Rao_SN_violin.pdf"))

Downsampled CHESS results

sample_names_short <- c("K562_1", "K562_2", "GM12878_3", "GM12878_5")
combinations <- expand.grid(sample_names_short, sample_names_short) %>% 
  filter(Var1 != Var2)

comparisons <- c(paste0(combinations$Var1, "_vs_", combinations$Var2))

chess_files <- list.files(file.path(basedir, "data", "chess", comparisons, "downsampled"), full.names = TRUE, pattern = "chr.*txt")
names(chess_files) <- sapply(chess_files, function(path){
  parts <- tail(strsplit(path, "/")[[1]], 3)
  gsub("downsampled_|.txt", "", paste(parts, collapse = "_"))
})

chess_comparisons_rao_downsampled <- lapply(chess_files, function(f){
  read.table(f, sep = "\t", header = TRUE) 
}) %>% bind_rows(.id = "comparison") 

chess_comparisons_rao_downsampled <- chess_comparisons_rao_downsampled %>%
  extract(comparison, regex = "(.*)_vs_(.*)_(chr[[:alnum:]]+)_window([[:alnum:]]+Mb)_step([[:alnum:]]+[k|M]b)_([[:alnum:]]+kb)",
          into = c("query", "ref", "chr", "window_size", "step_size", "resolution"), 
          remove = FALSE) %>% 
  dplyr::filter(!is.na(ssim))

chess_comparisons_rao_downsampled <- left_join(chess_comparisons_rao_downsampled, 
                                               dplyr::select(comparison_regions, -regions))

Thresholds from biological replicate comparisons

The idea is to take the top 10% of SN and bottom 10% of SSIM from comparisons of biological replicates and use these to set thresholds for comparisons of different biological conditions.

# The SSIM Z-score was calculated per chromosome, but I want it across chromosomes, so I'll recalculate this and also calculate Z-scores for SN.

chess_comparisons_rao_downsampled <- chess_comparisons_rao_downsampled %>%
  group_by(query, ref, window_size, step_size, resolution) %>%
  mutate(z_SN = (SN - mean(SN)) / sd(SN),
         z_ssim = (ssim - mean(ssim)) / sd(ssim))

chess_comparisons_rao_downsampled_dedup <- chess_comparisons_rao_downsampled %>% 
  mutate(query = factor(query, levels = sample_names_short),
         ref = factor(ref, levels = sample_names_short)) %>% 
  dplyr::filter(as.numeric(query) < as.numeric(ref)) %>% 
  mutate(query = as.character(query),
         ref = as.character(ref))

chess_comparisons_rao_downsampled_dedup <- chess_comparisons_rao_downsampled_dedup %>%
  tidyr::unite("comparison", query, ref, sep = "_vs_", remove = FALSE) 

GM12878_thresholds <- chess_comparisons_rao_downsampled_dedup %>% 
  dplyr::filter(startsWith(query, "GM12878") & startsWith(ref, "GM12878")) %>% 
  group_by(query, ref, window_size, step_size, resolution) %>% 
  summarise(SN_q90 = quantile(SN, 0.9),
            ssim_q10 = quantile(ssim, 0.1),
            SN_q80 = quantile(SN, 0.8),
            ssim_q20 = quantile(ssim, 0.2))

K562_thresholds <- chess_comparisons_rao_downsampled_dedup %>% 
  dplyr::filter(startsWith(query, "K562") & startsWith(ref, "K562")) %>% 
  group_by(query, ref, window_size, step_size, resolution) %>% 
  summarise(SN_q90 = quantile(SN, 0.9),
            ssim_q10 = quantile(ssim, 0.1),
            SN_q80 = quantile(SN, 0.8),
            ssim_q20 = quantile(ssim, 0.2))

all_thresholds <- bind_rows(GM12878_thresholds, K562_thresholds) %>% 
  tidyr::unite("comparison", query, ref, sep = "_vs_", remove = FALSE)

all_thresholds
## # A tibble: 2 x 10
## # Groups:   query, ref, window_size, step_size [2]
##   comparison query ref   window_size step_size resolution SN_q90 ssim_q10 SN_q80
##   <chr>      <chr> <chr> <chr>       <chr>     <chr>       <dbl>    <dbl>  <dbl>
## 1 GM12878_3… GM12… GM12… 2Mb         500kb     25kb        0.577    0.274  0.521
## 2 K562_1_vs… K562… K562… 2Mb         500kb     25kb        0.813    0.282  0.695
## # … with 1 more variable: ssim_q20 <dbl>

Since the thresholds are very similar, I’ll use the slightly less stringent values for this analysis.

Applying thresholds

SN_threshold <- all_thresholds %>% pull(SN_q90) %>% min()
ssim_threshold <- all_thresholds %>% pull(ssim_q10) %>%  max()
chess_comparisons_rao_downsampled_dedup %>% 
  # dplyr::filter(startsWith(query, "K562") & startsWith(ref, "GM12878")) %>% 
  ggplot(aes(x = ssim, y = SN)) +
  geom_rect(aes(xmin=-Inf, xmax=ssim_threshold, ymin=SN_threshold, ymax=Inf), fill = "lightsteelblue") +
  geom_point(size = 0.5) +
  facet_grid(query~ref) +
  theme_classic(base_size = 10) +
  geom_hline(yintercept = SN_threshold, linetype = 2) +
  geom_vline(xintercept = ssim_threshold, linetype = 2) +
  theme(panel.border = element_rect(linetype="solid", colour = "black", 
                                    fill = NA, size = rel(2)), 
        axis.line = element_blank()) +
  labs(x = "SSIM", y = "SN")

Fig2_Rao_SSIM_SN_thresholds_scatterplot <- chess_comparisons_rao_downsampled_dedup %>% 
  dplyr::filter(startsWith(query, "K562") & startsWith(ref, "GM12878")) %>% 
  ggplot(aes(x = ssim, y = SN)) +
  geom_rect(aes(xmin=-Inf, xmax=ssim_threshold, ymin=SN_threshold, ymax=Inf), fill = "lightsteelblue") +
  geom_point(size = 0.5) +
  facet_grid(query~ref) +
  theme_classic(base_size = 10) +
  geom_hline(yintercept = SN_threshold, linetype = 2) +
  geom_vline(xintercept = ssim_threshold, linetype = 2) +
  theme(panel.border = element_rect(linetype="solid", colour = "black", 
                                    fill = NA, size = rel(2)), 
        axis.line = element_blank()) +
  labs(x = "SSIM", y = "SN")

Fig2_Rao_SSIM_SN_thresholds_scatterplot

Fig2_Rao_SSIM_SN_thresholds_scatterplot

ggsave(file.path(basedir, "figures/final/Fig2_Rao_SSIM_SN_thresholds_scatterplot.pdf"))
regions_passing_thresholds <- chess_comparisons_rao_downsampled_dedup %>% 
    dplyr::filter(SN > SN_threshold & ssim < ssim_threshold)

regions_passing_thresholds
## # A tibble: 349 x 14
## # Groups:   query, ref, window_size, step_size, resolution [4]
##    query  comparison   ref    chr   window_size step_size resolution    ID    SN
##    <chr>  <chr>        <chr>  <chr> <chr>       <chr>     <chr>      <int> <dbl>
##  1 K562_1 K562_1_vs_G… GM128… chr16 2Mb         500kb     25kb        5186 0.716
##  2 K562_1 K562_1_vs_G… GM128… chr2  2Mb         500kb     25kb         496 0.674
##  3 K562_1 K562_1_vs_G… GM128… chr2  2Mb         500kb     25kb         506 1.42 
##  4 K562_1 K562_1_vs_G… GM128… chr2  2Mb         500kb     25kb         521 0.807
##  5 K562_1 K562_1_vs_G… GM128… chr2  2Mb         500kb     25kb         522 1.01 
##  6 K562_1 K562_1_vs_G… GM128… chr2  2Mb         500kb     25kb         526 0.957
##  7 K562_1 K562_1_vs_G… GM128… chr2  2Mb         500kb     25kb         527 0.898
##  8 K562_1 K562_1_vs_G… GM128… chr2  2Mb         500kb     25kb         528 0.845
##  9 K562_1 K562_1_vs_G… GM128… chr2  2Mb         500kb     25kb         537 0.671
## 10 K562_1 K562_1_vs_G… GM128… chr2  2Mb         500kb     25kb         538 0.931
## # … with 339 more rows, and 5 more variables: ssim <dbl>, z_ssim <dbl>,
## #   start <int>, end <int>, z_SN <dbl>
regions_passing_thresholds_n <- regions_passing_thresholds %>% 
  group_by(query, ref) %>% 
  summarise(n = n())

regions_passing_thresholds %>% 
  group_by(ID) %>% 
  summarise(n = n()) %>% 
  pull(n) %>% table()
## .
##  1  2  3  4 
## 20 20 27 52

Export regions of interest

for (x in unique(regions_passing_thresholds$comparison)){
  message(x)
  regions_passing_thresholds %>%
    dplyr::filter(comparison == x) %>% 
    arrange(-ssim)  %>% 
    ungroup() %>% 
    select(chr, start, end) %>% 
    unique() %>% 
    write.table(file = file.path(basedir, "data/chess/", paste0(x, "_window2Mb_step500kb_25kb_ssim10_SN90.bed")),
                col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
}

regions_passing_thresholds %>%
  group_by(ID) %>% 
  mutate(n_passing = n()) %>% 
  dplyr::filter(n_passing == 4) %>%
  arrange(-ssim) %>% 
  ungroup() %>% 
  select(chr, start, end) %>% 
  unique() %>% 
  write.table(file = file.path(basedir, "data/chess/", "GM12878_vs_K562_window2Mb_step500kb_25kb_ssim10_SN90.bed"),
              col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")

Export sample regions without changes

set.seed(42)
chess_comparisons_rao_downsampled_dedup %>% 
  dplyr::filter(!(ID %in% regions_passing_thresholds$ID)) %>% 
  ungroup() %>% 
  dplyr::slice_sample(n = 100) %>% 
  arrange(-ssim)  %>% 
  select(chr, start, end) %>% 
  unique() %>% 
  write.table(file = file.path(basedir, "data/chess/", "GM12878_vs_K562_window2Mb_step500kb_25kb_negative_set.bed"),
                col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")

negative_set <- read.table(file.path(basedir, "data/chess/", "GM12878_vs_K562_window2Mb_step500kb_25kb_negative_set.bed"),
                           col.names = c("chr", "start", "end"))

negative_set <- left_join(negative_set, comparison_regions)

chess_comparisons_rao_downsampled_dedup %>% 
  dplyr::filter(startsWith(query, "K562") & startsWith(ref, "GM12878")) %>% 
  mutate(is_negative = ifelse(ID %in% negative_set$ID, "yes", "no")) %>% 
  ggplot(aes(x = ssim, y = SN)) +
  geom_rect(aes(xmin=-Inf, xmax=ssim_threshold, ymin=SN_threshold, ymax=Inf), fill = "lightsteelblue") +
  geom_point(size = 1, aes(colour = is_negative)) +
  facet_grid(query~ref) +
  theme_classic(base_size = 10) +
  geom_hline(yintercept = SN_threshold, linetype = 2) +
  geom_vline(xintercept = ssim_threshold, linetype = 2) +
  theme(panel.border = element_rect(linetype="solid", colour = "black", 
                                    fill = NA, size = rel(2)), 
        axis.line = element_blank()) +
  labs(x = "SSIM", y = "SN") +
  scale_colour_manual(values = c("yes" = "red", "no" = "black"))

CHESS results of comparing merged replicates

comparisons <- "GM12878_all_vs_K562_all"
chess_files <- list.files(file.path(basedir, "data", "chess", comparisons), full.names = TRUE, pattern = "chr.*txt")
names(chess_files) <- sapply(chess_files, function(path){
  parts <- tail(strsplit(path, "/")[[1]], 2)
  gsub(".txt", "", paste(parts, collapse = "_"))
})

chess_comparisons_rao_merged <- lapply(chess_files, function(f){
  read.table(f, sep = "\t", header = TRUE) 
}) %>% bind_rows(.id = "comparison") %>%
    extract(comparison, regex = "(.*)_vs_(.*)_(chr[[:alnum:]]+)_window([[:alnum:]]+Mb)_step([[:alnum:]]+[k|M]b)_([[:alnum:]]+kb)",
          into = c("query", "ref", "chr", "window_size", "step_size", "resolution"), 
          remove = FALSE) %>% 
  dplyr::filter(!is.na(ssim))

 chess_comparisons_rao_merged <- left_join(chess_comparisons_rao_merged, comparison_regions)
 
 chess_comparisons_rao_merged<- chess_comparisons_rao_merged %>%
  group_by(query, ref, window_size, step_size, resolution) %>%
  mutate(z_SN = (SN - mean(SN)) / sd(SN),
         z_ssim = (ssim - mean(ssim)) / sd(ssim))

When comparing merged replicates of GM12878 and K562, 580 regions on chromosomes 2 and 16 have an SN value of more than 0.6, out of 580 (100 %).

In comparison, for the DLBCL-control comparison, 1906 regions out of 5432 or (35.1 %) of regions genome wide have an SN value of more than 0.6.

Comparing SN distributions of all samples

bind_rows(chess_comparisons_DLBCL,
          chess_comparisons_rao_merged,
          chess_comparisons_rao_downsampled_dedup) %>%
  dplyr::filter(comparison != "mixA_vs_mixB_window2Mb_step500kb_25kb") %>% 
  dplyr::mutate(comparison = case_when(
    comparison == "DLBCL_vs_control_window2Mb_step500kb_25kb" ~ "DLBCL_vs_control",
    comparison == "GM12878_all_vs_K562_all_chr16_window2Mb_step500kb_25kb" ~ "GM12878_all_vs_K562_all",
    comparison == "GM12878_all_vs_K562_all_chr2_window2Mb_step500kb_25kb" ~ "GM12878_all_vs_K562_all",
    TRUE ~ comparison
  )) %>% 
  dplyr::mutate(comparison = factor(comparison, 
                levels = c("K562_1_vs_GM12878_3",  
                           "K562_1_vs_GM12878_5", 
                           "K562_2_vs_GM12878_3",
                           "K562_2_vs_GM12878_5",
                           "GM12878_3_vs_GM12878_5",
                           "K562_1_vs_K562_2",
                           "GM12878_all_vs_K562_all", 
                           "DLBCL_vs_control"), ordered = TRUE)) %>% 
  ggplot(aes(x = comparison, y = SN)) +
  geom_violin() +
  stat_summary(fun.data = "mean_sdl",  fun.args = list(mult = 1), 
    geom = "pointrange", color = "black") +
  geom_hline(yintercept = 0.6, linetype = 2, colour = "blue") +
  theme_classic(base_size = 10) + 
  coord_flip() +
  labs(x = "")

ggsave(file.path(basedir, "figures/final/Fig2_all_SN_distributions.pdf"))

Session info

This report was generated at 14:52:03, Tue Oct 12 2021.

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
## [1] patchwork_1.1.1      GenomicRanges_1.40.0 GenomeInfoDb_1.24.2 
## [4] IRanges_2.22.2       S4Vectors_0.26.1     BiocGenerics_0.34.0 
## [7] ggplot2_3.3.3        tidyr_1.1.3          dplyr_1.0.5         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7                matrixStats_0.59.0         
##  [3] RColorBrewer_1.1-2          rprojroot_2.0.2            
##  [5] tools_4.0.3                 backports_1.2.1            
##  [7] bslib_0.2.4                 utf8_1.2.1                 
##  [9] R6_2.5.0                    rpart_4.1-15               
## [11] Hmisc_4.5-0                 DBI_1.1.1                  
## [13] colorspace_2.0-1            nnet_7.3-15                
## [15] withr_2.4.2                 gridExtra_2.3              
## [17] tidyselect_1.1.0            compiler_4.0.3             
## [19] cli_2.5.0                   Biobase_2.48.0             
## [21] htmlTable_2.1.0             DelayedArray_0.14.1        
## [23] rtracklayer_1.48.0          labeling_0.4.2             
## [25] sass_0.3.1                  checkmate_2.0.0            
## [27] scales_1.1.1                readr_1.4.0                
## [29] corrr_0.4.3                 stringr_1.4.0              
## [31] digest_0.6.27               Rsamtools_2.4.0            
## [33] foreign_0.8-81              rmarkdown_2.7              
## [35] XVector_0.28.0              base64enc_0.1-3            
## [37] jpeg_0.1-8.1                pkgconfig_2.0.3            
## [39] htmltools_0.5.1.1           highr_0.8                  
## [41] htmlwidgets_1.5.3           rlang_0.4.11               
## [43] rstudioapi_0.13             jquerylib_0.1.3            
## [45] farver_2.1.0                generics_0.1.0             
## [47] jsonlite_1.7.2              BiocParallel_1.22.0        
## [49] RCurl_1.98-1.2              magrittr_2.0.1             
## [51] GenomeInfoDbData_1.2.3      Formula_1.2-4              
## [53] Matrix_1.3-4                munsell_0.5.0              
## [55] fansi_0.5.0                 lifecycle_1.0.0            
## [57] stringi_1.5.3               yaml_2.2.1                 
## [59] SummarizedExperiment_1.18.2 zlibbioc_1.34.0            
## [61] grid_4.0.3                  crayon_1.4.1               
## [63] lattice_0.20-41             Biostrings_2.56.0          
## [65] splines_4.0.3               hms_1.0.0                  
## [67] knitr_1.31                  pillar_1.6.1               
## [69] codetools_0.2-18            XML_3.99-0.5               
## [71] glue_1.4.2                  drat_0.1.8                 
## [73] evaluate_0.14               latticeExtra_0.6-29        
## [75] data.table_1.14.0           vctrs_0.3.8                
## [77] png_0.1-7                   gtable_0.3.0               
## [79] purrr_0.3.4                 assertthat_0.2.1           
## [81] xfun_0.22                   broom_0.7.5                
## [83] survival_3.2-9              tibble_3.1.2               
## [85] GenomicAlignments_1.24.0    cluster_2.1.1              
## [87] ellipsis_0.3.2              here_1.0.1